import os
import pandas as pd
import numpy as np
import plotly_express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import warnings
warnings.filterwarnings('ignore')
from lightgbm import LGBMRegressor
import joblib
import numpy as np # array, vector, matrix calculations
import pandas as pd # DataFrame handling
import shap # for consistent, signed variable importance measurements
import xgboost as xgb # gradient boosting machines (GBMs)
import matplotlib.pyplot as plt # plotting
# Show every column when displaying DataFrames in the notebook
pd.options.display.max_columns = 999
# enables display of plots in notebook
%matplotlib inline
np.random.seed(12345)  # fix the NumPy random seed for reproducibility

# Load the three M5 competition tables
sales = pd.read_csv('sales_train_evaluation.csv')
sales.name = 'sales'
calendar = pd.read_csv('calendar.csv')
calendar.name = 'calendar'
prices = pd.read_csv('sell_prices.csv')
prices.name = 'prices'

# Add placeholder sales columns for the forecast horizon d_1942..d_1969,
# initialised to 0 (these are the days the model must predict)
for d in range(1942,1970):
    col = 'd_' + str(d)
    sales[col] = 0
    sales[col] = sales[col].astype(np.int16)

# Total memory (MB) used by each table before downcasting, for comparison below
sales_bd = np.round(sales.memory_usage().sum()/(1024*1024),1)
calendar_bd = np.round(calendar.memory_usage().sum()/(1024*1024),1)
prices_bd = np.round(prices.memory_usage().sum()/(1024*1024),1)
#Downcast in order to save memory
def downcast(df):
    """Reduce DataFrame memory usage by narrowing each column's dtype in place.

    Integer columns are cast to the smallest of int8/int16/int32/int64 whose
    range strictly contains the column's min/max; float columns likewise to
    float16/float32/float64.  Object columns named 'date' are parsed as
    datetimes; every other object column becomes a pandas 'category'.

    Args:
        df: DataFrame to downcast (modified in place).

    Returns:
        The same DataFrame with narrowed dtypes.
    """
    for col, dtype in df.dtypes.items():
        kind = str(dtype)
        if 'int' in kind:
            lo, hi = df[col].min(), df[col].max()
            # strict comparisons are conservative: a value equal to the type's
            # extreme falls through to the next wider type
            if lo > np.iinfo(np.int8).min and hi < np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif lo > np.iinfo(np.int16).min and hi < np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif lo > np.iinfo(np.int32).min and hi < np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            else:
                df[col] = df[col].astype(np.int64)
        elif 'float' in kind:
            lo, hi = df[col].min(), df[col].max()
            if lo > np.finfo(np.float16).min and hi < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif lo > np.finfo(np.float32).min and hi < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        elif dtype == object:  # bug fix: np.object was removed in NumPy 1.24
            if col == 'date':
                df[col] = pd.to_datetime(df[col], format='%Y-%m-%d')
            else:
                df[col] = df[col].astype('category')
    return df
# Downcast all three tables in place to shrink their memory footprint
sales = downcast(sales)
prices = downcast(prices)
calendar = downcast(calendar)
# Total memory (MB) used after downcasting
sales_ad = np.round(sales.memory_usage().sum()/(1024*1024),1)
calendar_ad = np.round(calendar.memory_usage().sum()/(1024*1024),1)
prices_ad = np.round(prices.memory_usage().sum()/(1024*1024),1)
# Grouped bar chart comparing per-table memory before vs after downcasting
dic = {'DataFrame':['sales','calendar','prices'],
       'Before downcasting':[sales_bd,calendar_bd,prices_bd],
       'After downcasting':[sales_ad,calendar_ad,prices_ad]}
memory = pd.DataFrame(dic)
memory = pd.melt(memory, id_vars='DataFrame', var_name='Status', value_name='Memory (MB)')
memory.sort_values('Memory (MB)',inplace=True)
fig = px.bar(memory, x='DataFrame', y='Memory (MB)', color='Status', barmode='group', text='Memory (MB)')
fig.update_traces(texttemplate='%{text} MB', textposition='outside')
fig.update_layout(template='seaborn', title='Effect of Downcasting')
fig.show() # the plot shows downcasting reduces memory substantially
# Reshape sales to long format (one row per item-day) and join calendar and
# price information onto it.
df = pd.melt(sales, id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
             var_name='d', value_name='sold').dropna()
df = pd.merge(df, calendar, on='d', how='left')
df = pd.merge(df, prices, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')
# Remove Christmas Day of every year: those days are outliers, and the test
# horizon contains no Christmas Day.
# Bug fix: the original used unary minus on boolean Series
# (-(df['date']==...)), which is deprecated/removed in modern pandas;
# ~ with isin() is the supported, equivalent form.
christmas_days = pd.to_datetime(['2011-12-25', '2012-12-25', '2013-12-25',
                                 '2014-12-25', '2015-12-25'])
df = df[~df['date'].isin(christmas_days)]
# Item counts per state/store/category/department (for hierarchy views)
group = sales.groupby(['state_id','store_id','cat_id','dept_id'],as_index=False)['item_id'].count().dropna()
group['USA'] = 'United States of America'
group.rename(columns={'state_id':'State','store_id':'Store','cat_id':'Category','dept_id':'Department','item_id':'Count'},inplace=True)
# Mean selling price per item at store and at category level
group_price_store = df.groupby(['state_id','store_id','item_id'],as_index=False)['sell_price'].mean().dropna()
group_price_cat = df.groupby(['store_id','cat_id','item_id'],as_index=False)['sell_price'].mean().dropna()
# Daily units sold per store (drives the stacked time-series figure below)
group = df.groupby(['year','date','state_id','store_id'], as_index=False)['sold'].sum().dropna()
fig = go.Figure()
title = 'Items sold over time'
years = group.year.unique().tolist()
buttons = []
# Add one trace per store; each state's stores share one stacked y axis.
# The axis titles below put CA on y3 (top), TX on y2, WI on y (bottom), so
# the first state iterated gets y3 and y is decremented per state.
y=3
for state in group.state_id.unique().tolist():
    group_state = group[group['state_id']==state]
    for store in group_state.store_id.unique().tolist():
        group_state_store = group_state[group_state['store_id']==store]
        fig.add_trace(go.Scatter(name=store, x=group_state_store['date'], y=group_state_store['sold'], showlegend=True,
                                 yaxis='y'+str(y) if y!=1 else 'y'))
    y-=1
# Layout: one shared x axis with range-selector buttons and a slider; three
# stacked y axes, one per state (bottom = WI, middle = TX, top = CA).
fig.update_layout(
    xaxis=dict(
        #autorange=True,
        range = ['2011-01-29','2016-05-22'],
        rangeselector=dict(
            buttons=list([
                dict(count=1,
                     label="1m",
                     step="month",
                     stepmode="backward"),
                dict(count=6,
                     label="6m",
                     step="month",
                     stepmode="backward"),
                dict(count=1,
                     label="YTD",
                     step="year",
                     stepmode="todate"),
                dict(count=1,
                     label="1y",
                     step="year",
                     stepmode="backward"),
                dict(count=2,
                     label="2y",
                     step="year",
                     stepmode="backward"),
                dict(count=3,
                     label="3y",
                     step="year",
                     stepmode="backward"),
                dict(count=4,
                     label="4y",
                     step="year",
                     stepmode="backward"),
                dict(step="all")
            ])
        ),
        rangeslider=dict(
            autorange=True,
        ),
        type="date"
    ),
    # bottom panel: Wisconsin stores
    yaxis=dict(
        anchor="x",
        autorange=True,
        domain=[0, 0.33],
        mirror=True,
        showline=True,
        side="left",
        tickfont={"size":10},
        tickmode="auto",
        ticks="",
        title='WI',
        titlefont={"size":20},
        type="linear",
        zeroline=False
    ),
    # middle panel: Texas stores
    yaxis2=dict(
        anchor="x",
        autorange=True,
        domain=[0.33, 0.66],
        mirror=True,
        showline=True,
        side="left",
        tickfont={"size":10},
        tickmode="auto",
        ticks="",
        title = 'TX',
        titlefont={"size":20},
        type="linear",
        zeroline=False
    ),
    # top panel: California stores
    yaxis3=dict(
        anchor="x",
        autorange=True,
        domain=[0.66, 1],
        mirror=True,
        showline=True,
        side="left",
        tickfont={"size":10},
        tickmode="auto",
        ticks='',
        title="CA",
        titlefont={"size":20},
        type="linear",
        zeroline=False
    )
)
fig.update_layout(template='seaborn', title=title)
fig.show()
# The per-store series look stable over time in this plot, suggesting the
# date by itself may not have a large effect on the final result.
df['revenue'] = df['sold']*df['sell_price'].astype(np.float32) # revenue = units sold x unit price
def introduce_nulls(df):
    """Reindex *df* onto a complete daily date range.

    Dates missing between the first and last 'date' value become all-NaN
    rows, which makes Plotly draw gaps instead of connecting across them.
    Returns a new DataFrame; the input frame is left untouched.
    """
    full_range = pd.date_range(df.date.dt.date.min(), df.date.dt.date.max())
    reindexed = df.set_index('date').reindex(full_range)
    reindexed = reindexed.reset_index().rename(columns={'index': 'date'})
    return reindexed
def plot_metric(df,state,store,metric):
    """Plot a store's daily `metric`, split into SNAP vs non-SNAP days,
    both for all categories together and for FOODS only (four traces).

    Args:
        df: long-format merged sales/calendar/prices frame.
        state: state id, e.g. 'CA' (selects the snap_<state> calendar flag).
        store: store id, e.g. 'CA_1'.
        metric: 'sold' or 'revenue'.

    Returns:
        A plotly Figure with a range selector and slider on the x axis.
    """
    snap_col = 'snap_' + state
    store_sales = df[(df['state_id']==state)&(df['store_id']==store)&(df['date']<='2016-05-22')]
    food_sales = store_sales[store_sales['cat_id']=='FOODS']
    # Bug fix: select with a list, [['sold','revenue']] — the bare
    # ['sold','revenue'] (tuple) form of groupby indexing was deprecated and
    # then removed from pandas.
    store_sales = store_sales.groupby(['date',snap_col],as_index=False)[['sold','revenue']].sum()
    snap_sales = store_sales[store_sales[snap_col]==1]
    non_snap_sales = store_sales[store_sales[snap_col]==0]
    food_sales = food_sales.groupby(['date',snap_col],as_index=False)[['sold','revenue']].sum()
    snap_foods = food_sales[food_sales[snap_col]==1]
    non_snap_foods = food_sales[food_sales[snap_col]==0]
    # Reindex each series onto a full daily range so missing days (e.g. the
    # removed Christmas rows) show up as gaps rather than connected lines.
    non_snap_sales = introduce_nulls(non_snap_sales)
    snap_sales = introduce_nulls(snap_sales)
    non_snap_foods = introduce_nulls(non_snap_foods)
    snap_foods = introduce_nulls(snap_foods)
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=non_snap_sales['date'],y=non_snap_sales[metric],
                             name='Total '+metric+'(Non-SNAP)'))
    fig.add_trace(go.Scatter(x=snap_sales['date'],y=snap_sales[metric],
                             name='Total '+metric+'(SNAP)'))
    fig.add_trace(go.Scatter(x=non_snap_foods['date'],y=non_snap_foods[metric],
                             name='Food '+metric+'(Non-SNAP)'))
    fig.add_trace(go.Scatter(x=snap_foods['date'],y=snap_foods[metric],
                             name='Food '+metric+'(SNAP)'))
    fig.update_yaxes(title_text='Total items sold' if metric=='sold' else 'Total revenue($)')
    fig.update_layout(template='seaborn',title=store)
    # Same range-selector/slider controls as the store-level time series plot.
    fig.update_layout(
        xaxis=dict(
            #autorange=True,
            range = ['2011-01-29','2016-05-22'],
            rangeselector=dict(
                buttons=list([
                    dict(count=1, label="1m", step="month", stepmode="backward"),
                    dict(count=6, label="6m", step="month", stepmode="backward"),
                    dict(count=1, label="YTD", step="year", stepmode="todate"),
                    dict(count=1, label="1y", step="year", stepmode="backward"),
                    dict(count=2, label="2y", step="year", stepmode="backward"),
                    dict(count=3, label="3y", step="year", stepmode="backward"),
                    dict(count=4, label="4y", step="year", stepmode="backward"),
                    dict(step="all")
                ])
            ),
            rangeslider=dict(
                autorange=True,
            ),
            type="date"
        ))
    return fig
# Calendar-heatmap input: per-store daily totals up to the last training date.
cal_data = group.copy()
# Use the unambiguous ISO form; the original '22-05-2016' literal relied on
# day-first inference but denotes the same date, 2016-05-22.
cal_data = cal_data[cal_data.date <= '2016-05-22']
# Bug fix: Series.dt.weekofyear was removed in pandas 2.0;
# isocalendar().week is the equivalent ISO week number.
cal_data['week'] = cal_data.date.dt.isocalendar().week
cal_data['day_name'] = cal_data.date.dt.day_name()
def calmap(cal_data, state, store, scale):
    """Draw a calendar heat-map of daily units sold for one store:
    one subplot row per year, weeks on the x axis, weekdays on the y axis.

    Args:
        cal_data: frame with state_id/store_id/year/date/week/day_name/sold.
        state: state id used to filter rows.
        store: store id used to filter rows (also the figure title).
        scale: plotly colorscale name shared by all years.

    Returns:
        A plotly Figure with len(years) stacked heatmaps.
    """
    cal_data = cal_data[(cal_data['state_id'] == state) & (cal_data['store_id'] == store)]
    years = cal_data.year.unique().tolist()
    fig = make_subplots(rows=len(years), cols=1, shared_xaxes=True, vertical_spacing=0.005)
    for row, year in enumerate(years, start=1):
        # fill calendar gaps with NaN rows so missing days render blank
        yearly = introduce_nulls(cal_data[cal_data['year'] == year])
        fig.add_trace(go.Heatmap(
            z=yearly.sold,
            x=yearly.week,
            y=yearly.day_name,
            hovertext=yearly.date.dt.date,
            coloraxis="coloraxis", name=year,
        ), row, 1)
        fig.update_yaxes(title_text=year, tickfont=dict(size=5), row=row, col=1)
    fig.update_xaxes(range=[1, 53], tickfont=dict(size=10), nticks=53)
    fig.update_layout(coloraxis={'colorscale': scale})
    fig.update_layout(template='seaborn', title=store)
    return fig
# Keep code -> label mappings for every categorical id column so the integer
# codes used for modelling can be mapped back to the original string ids.
d_id = dict(zip(df.id.cat.codes, df.id))
d_item_id = dict(zip(df.item_id.cat.codes, df.item_id))
d_dept_id = dict(zip(df.dept_id.cat.codes, df.dept_id))
d_cat_id = dict(zip(df.cat_id.cat.codes, df.cat_id))
d_store_id = dict(zip(df.store_id.cat.codes, df.store_id))
d_state_id = dict(zip(df.state_id.cat.codes, df.state_id))
# Free the exploratory aggregation frames that are no longer needed
del group, group_price_cat, group_price_store, group_state, group_state_store, cal_data
gc.collect();
#2
# Convert the 'd_123' day labels to plain integers so they compare/sort
# numerically.
df['d'] = df['d'].apply(lambda x: x.split('_')[1]).astype(np.int16)
# Replace each categorical column with its integer codes (the d_* dicts above
# keep the reverse mapping).  Renamed the loop variable: the original used
# `type`, shadowing the builtin.
for col, dtype in df.dtypes.items():
    if dtype.name == 'category':
        df[col] = df[col].cat.codes
#3
# The calendar fields already carry the date information, so the raw
# timestamp column is redundant for modelling.
df.drop('date',axis=1,inplace=True)
# Lag features: units sold 1, 2, 3, 6, 12, 24 and 36 days earlier for the
# same series (float16 keeps the NaNs representable and saves memory).
lags = [1,2,3,6,12,24,36]
for lag in lags:
    df['sold_lag_'+str(lag)] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],as_index=False)['sold'].shift(lag).astype(np.float16)
# Mean-encoded features: average units sold at several aggregation levels.
# NOTE(review): 'iteam_sold_avg' looks like a typo for 'item_sold_avg' — kept
# unchanged because the saved models were trained with this column name.
df['iteam_sold_avg'] = df.groupby('item_id')['sold'].transform('mean').astype(np.float16)
df['state_sold_avg'] = df.groupby('state_id')['sold'].transform('mean').astype(np.float16)
df['store_sold_avg'] = df.groupby('store_id')['sold'].transform('mean').astype(np.float16)
df['cat_sold_avg'] = df.groupby('cat_id')['sold'].transform('mean').astype(np.float16)
df['dept_sold_avg'] = df.groupby('dept_id')['sold'].transform('mean').astype(np.float16)
df['cat_dept_sold_avg'] = df.groupby(['cat_id','dept_id'])['sold'].transform('mean').astype(np.float16)
df['store_item_sold_avg'] = df.groupby(['store_id','item_id'])['sold'].transform('mean').astype(np.float16)
df['cat_item_sold_avg'] = df.groupby(['cat_id','item_id'])['sold'].transform('mean').astype(np.float16)
df['dept_item_sold_avg'] = df.groupby(['dept_id','item_id'])['sold'].transform('mean').astype(np.float16)
df['state_store_sold_avg'] = df.groupby(['state_id','store_id'])['sold'].transform('mean').astype(np.float16)
df['state_store_cat_sold_avg'] = df.groupby(['state_id','store_id','cat_id'])['sold'].transform('mean').astype(np.float16)
df['store_cat_dept_sold_avg'] = df.groupby(['store_id','cat_id','dept_id'])['sold'].transform('mean').astype(np.float16)
# 7-day rolling mean and expanding mean (min 2 periods) of sales per series
df['rolling_sold_mean'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])['sold'].transform(lambda x: x.rolling(window=7).mean()).astype(np.float16)
df['expanding_sold_mean'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])['sold'].transform(lambda x: x.expanding(2).mean()).astype(np.float16)
# selling_trend: deviation of the per-series/per-day average from the
# series' overall average
df['daily_avg_sold'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id','d'])['sold'].transform('mean').astype(np.float16)
df['avg_sold'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])['sold'].transform('mean').astype(np.float16)
df['selling_trend'] = (df['daily_avg_sold'] - df['avg_sold']).astype(np.float16)
df.drop(['daily_avg_sold','avg_sold'],axis=1,inplace=True)
# Drop the first 35 days, where the longest lag/rolling features are all NaN
df = df[df['d']>=36]
df # final dataset used to build the model
df.to_pickle('data.pkl')
del df
gc.collect(); # dataset saved to data.pkl, memory released
# Reload the engineered dataset from disk
df = pd.read_pickle('data.pkl')
df
data = df
# Hold out d 1914-1941 for validation and d >= 1942 for the evaluation
# (submission) horizon; the *_preds Series are filled in by index later.
valid1 = data[(data['d']>=1914) & (data['d']<1942)][['id','d','sold']]
test1 = data[data['d']>=1942][['id','d','sold']]
eval_preds = test1['sold']
valid_preds = valid1['sold']
df
# Target column name and list of feature column names
y = 'sold'
X = [name for name in df.columns if name not in [y,'id']]
print('y =', y)
print('X =', X)
stores = sales.store_id.cat.codes.unique().tolist()
# Not enough memory to train on the whole dataset at once, so train one model
# per store (10 models), predict per store, and merge the results at the end.
for store in stores:
    df = data[data['store_id']==store]
    # time-based split: train < d1914, validation d1914-1941, test >= d1942
    train = df[df['d']<1914]
    valid = df[(df['d']>=1914) & (df['d']<1942)]
    test = df[df['d']>=1942]
    dtrain = xgb.DMatrix(train[X], train[y])
    dtest = xgb.DMatrix(test[X], test[y])
    dvaild = xgb.DMatrix(valid[X], valid[y])
    params = {
        'booster': 'gbtree',       # base learner will be decision tree
        'eval_metric': 'rmse',     # validation metric: root mean squared error
        'eta': 0.08,               # learning rate
        'subsample': 0.8,          # use 80% of rows in each decision tree
        'colsample_bytree': 0.8,   # use 80% of columns in each decision tree
        'max_depth': 8,            # allow decision trees to grow to depth of 8
        'seed': 12345              # set random seed for reproducibility
    }
    watchlist = [(dtrain, 'train'), (dvaild, 'eval')]
    xgb_model = xgb.train(params,                    # tuning parameters from above
                          dtrain,                    # training data
                          1000,                      # maximum of 1000 iterations (trees)
                          evals=watchlist,           # use watchlist for early stopping
                          early_stopping_rounds=15,  # stop after 15 rounds with no RMSE improvement
                          verbose_eval=5)
    # write this store's predictions into the global prediction Series by index
    eval_preds[test.index] = xgb_model.predict(dtest)
    valid_preds[valid.index] = xgb_model.predict(dvaild)
    # SHAP summary for this store's validation slice; the last contribution
    # column is the bias term, hence the [:, :-1] slice.
    # Bug fix: the feature matrix must be this store's `valid` rows — the
    # original passed `valid1`, which covers all stores and only holds
    # id/d/sold, so its rows and columns do not match shap_values.
    shap_values = xgb_model.predict(dvaild, pred_contribs=True, ntree_limit=xgb_model.best_ntree_limit)
    shap.summary_plot(shap_values[:, :-1], valid[xgb_model.feature_names])
    filename = 'xgbmodel'+str(d_store_id[store])+'.pkl'
    # save model
    joblib.dump(xgb_model, filename)
    del xgb_model, train, valid, test
    gc.collect()
actual = False
if actual == False:
    # The validation-period answers (d_1914..d_1941) are already public, so
    # read them straight out of the evaluation sales table.
    validation = sales[['id']+['d_' + str(i) for i in range(1914,1942)]]
    validation['id'] = pd.read_csv('sales_train_validation.csv').id
    validation.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
else:
    # Otherwise pivot our own validation-period predictions into the wide
    # F1..F28 submission shape.
    # Bug fix: use `valid1` (the full validation split kept at module level);
    # the original used `valid`, a per-store frame that is deleted inside the
    # training loop and would raise NameError here.
    valid1['sold'] = valid_preds
    validation = valid1[['id','d','sold']]
    validation = pd.pivot(validation, index='id', columns='d', values='sold').reset_index()
    validation.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
    validation.id = validation.id.map(d_id).str.replace('evaluation','validation')
# the validation half of the submission
validation
actual = False
if actual == False:
    # The validation answers are public, so take them directly from `sales`
    validation = sales[['id']+['d_' + str(i) for i in range(1914,1942)]]
    validation['id']=pd.read_csv('sales_train_validation.csv').id
    validation.columns=['id'] + ['F' + str(i + 1) for i in range(28)]
else:
    # Otherwise pivot our validation-period predictions into wide F1..F28 form
    valid1['sold'] = valid_preds
    validation = valid1[['id','d','sold']]
    validation = pd.pivot(validation, index='id', columns='d', values='sold').reset_index()
    validation.columns=['id'] + ['F' + str(i + 1) for i in range(28)]
    validation.id = validation.id.map(d_id).str.replace('evaluation','validation')
# Pivot the evaluation-period predictions (d_1942..d_1969) into wide form
test1['sold'] = eval_preds
evaluation = test1[['id','d','sold']]
evaluation = pd.pivot(evaluation, index='id', columns='d', values='sold').reset_index()
evaluation.columns=['id'] + ['F' + str(i + 1) for i in range(28)]
# Remap the integer id codes back to their original string ids
evaluation.id = evaluation.id.map(d_id)
# Concatenate the two halves and write the submission file
submit = pd.concat([validation,evaluation]).reset_index(drop=True)
submit.to_csv('xgboostsub.csv',index=False)
subxg=pd.read_csv('xgboostsub.csv')
subxg # the final result to be submitted
#shap
dfshap = pd.read_pickle('data.pkl')
y = 'sold'
X = [name for name in dfshap.columns if name not in [y,'id']]
print('y =', y)
print('X =', X)

def plot_store_shap(store_code):
    """Load one saved per-store XGBoost model and draw the SHAP summary plot
    for that store's validation period (d 1914-1941).

    Args:
        store_code: integer category code of the store (0..9); d_store_id
                    maps it to the store name used in the model filename.
    """
    store_df = dfshap[dfshap['store_id'] == store_code]
    valid_rows = store_df[(store_df['d'] >= 1914) & (store_df['d'] < 1942)]
    dvalid = xgb.DMatrix(valid_rows[X], valid_rows[y])
    model = joblib.load('xgbmodel' + str(d_store_id[store_code]) + '.pkl')
    # last contribution column is the bias term, hence the [:, :-1] slice
    shap_values = model.predict(dvalid, pred_contribs=True, ntree_limit=model.best_ntree_limit)
    shap.summary_plot(shap_values[:, :-1], valid_rows[model.feature_names])

# One SHAP summary plot per store (codes 0..9 map to CA_1..CA_4, TX_1..TX_3,
# WI_1..WI_3 via d_store_id) — replaces ten copy-pasted blocks.
for store_code in range(10):
    plot_store_shap(store_code)
Note: As mentioned above, we trained 10 models, one per store. Accordingly, our group selects one model (CA_1) as a representative and completes all of the ICE and partial-dependence work for it below. The ICE and PDP results for the remaining 9 models/stores could be produced in exactly the same way, so we decided not to repeat that duplicated but massive work.
# Reload the engineered dataset for the PDP/ICE analysis
dfshap = pd.read_pickle('data.pkl')
y = 'sold'
X = [name for name in dfshap.columns if name not in [y,'id']]
print('y =', y)
print('X =', X)
# split the data (same horizon as training: d 1914-1941 validation,
# d >= 1942 evaluation)
valid = dfshap[(dfshap['d']>=1914) & (dfshap['d']<1942)][['id','d','sold']]
test = dfshap[dfshap['d']>=1942][['id','d','sold']]
eval_preds = test['sold']
valid_preds = valid['sold']
def par_dep(xs, frame, model, resolution=20, bins=None):
    """ Creates Pandas DataFrame containing partial dependence for a
    single variable.

    Args:
        xs: Variable for which to calculate partial dependence.
        frame: Pandas DataFrame for which to calculate partial dependence.
        model: XGBoost model for which to calculate partial dependence.
        resolution: The number of points across the domain of xs for which
                    to calculate partial dependence, default 20.
        bins: List of values at which to set xs, default 20 equally-spaced
              points between column minimum and maximum.

    Returns:
        Pandas DataFrame with columns [xs, 'partial_dependence'].
    """
    # turn off pesky Pandas copy warning
    pd.options.mode.chained_assignment = None
    # cache original column values so frame can be restored afterwards
    col_cache = frame.loc[:, xs].copy(deep=True)
    # determine values at which to calculate partial dependence
    # (bug fix: 'is None' rather than '== None' — equality comparison breaks
    # when callers pass array-like bins)
    if bins is None:
        min_ = frame[xs].min()
        max_ = frame[xs].max()
        by = (max_ - min_)/resolution
        bins = np.arange(min_, max_, by)
    # Score the frame once per bin value with the column of interest held
    # constant; the mean prediction is the partial dependence at that value.
    # (Fix: DataFrame.append was removed in pandas 2.0 — accumulate rows in a
    # list and build the frame once.)
    records = []
    for j in bins:
        frame.loc[:, xs] = j
        dframe = xgb.DMatrix(frame)
        preds = model.predict(dframe, ntree_limit=model.best_ntree_limit)
        records.append({xs: j, 'partial_dependence': float(np.mean(preds))})
    # return input frame to original cached state
    frame.loc[:, xs] = col_cache
    return pd.DataFrame(records, columns=[xs, 'partial_dependence'])
# Partial dependence for store CA_1 (store_id category code 0), computed on
# its validation slice (d 1914-1941)
df0 = dfshap[dfshap['store_id']==0]
validshap0 = df0[(df0['d']>=1914) & (df0['d']<1942)]
#dvaild=xgb.DMatrix(validshap0[X], validshap0[y])
xgb_model = joblib.load('xgbmodelCA_1.pkl')
par_dep_rolling_sold_mean = par_dep('rolling_sold_mean', validshap0[X], xgb_model) # partial dependence for rolling sold mean
# display partial dependence for rolling sold mean
par_dep_rolling_sold_mean
par_dep_selling_trend = par_dep('selling_trend', validshap0[X], xgb_model) # partial dependence for selling trend
# display partial dependence for selling trend
par_dep_selling_trend
par_dep_sell_price = par_dep('sell_price', validshap0[X], xgb_model) # partial dependence for sell price
# display partial dependence for sell price
par_dep_sell_price
par_dep_revenue = par_dep('revenue', validshap0[X], xgb_model) # partial dependence for revenue
# display partial dependence for revenue
par_dep_revenue
par_dep_store_item_sold_avg = par_dep('store_item_sold_avg', validshap0[X], xgb_model) # partial dependence for store item sold average
# display partial dependence for store item sold average
par_dep_store_item_sold_avg
def get_percentile_dict(yhat, id_, frame):
    """ Returns the percentiles of a column, yhat, as the indices based on
    another column id_.

    Args:
        yhat: Column in which to find percentiles.
        id_: Id column that stores indices for percentiles of yhat.
        frame: Pandas DataFrame containing yhat and id_.

    Returns:
        Dictionary mapping percentile (0, 10, ..., 90, 99) to an id_ value.
    """
    # work on a copy, ordered by the prediction column
    ordered = frame.copy(deep=True).sort_values(yhat)
    ordered.reset_index(inplace=True)
    n_rows = ordered.shape[0]
    # the first and last rows stand in for the 0th and 99th percentiles
    percentiles = {0: ordered.loc[0, id_],
                   99: ordered.loc[n_rows - 1, id_]}
    # evenly spaced rows provide the 10th through 90th percentiles
    step = n_rows // 10
    for decile in range(1, 10):
        percentiles[decile * 10] = ordered.loc[decile * step, id_]
    return percentiles
# Predictions for CA_1's validation rows, used to pick ICE percentile rows
df0 = dfshap[dfshap['store_id']==0]
validshap0 = df0[(df0['d']>=1914) & (df0['d']<1942)]
dvaild=xgb.DMatrix(validshap0[X], validshap0[y])
#dvaild=xgb.DMatrix(validshap0[X], validshap0[y])
xgb_model = joblib.load('xgbmodelCA_1.pkl')
# Attach the model's predictions to the validation rows as a new column
yhat_test = pd.concat([validshap0.reset_index(drop=True), pd.DataFrame(xgb_model.predict(dvaild,
                                                                                         ntree_limit=xgb_model.best_ntree_limit))],
                      axis=1)
yhat_test
yhat_test = yhat_test.rename(columns={0:'p_sold'})
# Ids of the rows sitting at each percentile of predicted sales
percentile_dict = get_percentile_dict('p_sold', 'id', yhat_test)
percentile_dict
# retrieve bins from the original partial dependence calculation so the ICE
# curves are evaluated at exactly the same grid points
bins_rolling_sold_mean = list(par_dep_rolling_sold_mean['rolling_sold_mean'])
bins_selling_trend = list(par_dep_selling_trend['selling_trend'])
bins_sell_price = list(par_dep_sell_price['sell_price'])
bins_revenue = list(par_dep_revenue['revenue'])
bins_store_item_sold_avg = list(par_dep_store_item_sold_avg['store_item_sold_avg'])
# For each percentile in percentile_dict, add a column to every par_dep frame
# holding the ICE curve — the single-row partial dependence — for the id that
# sits at that percentile of predicted sales.
for i in sorted(percentile_dict.keys()):
    col_name = 'Percentile_' + str(i)
    # ICE curves for rolling_sold_mean across percentiles at bins_rolling_sold_mean intervals
    par_dep_rolling_sold_mean[col_name] = par_dep('rolling_sold_mean',
                                                  validshap0[validshap0['id'] == int(percentile_dict[i])][X],
                                                  xgb_model,
                                                  bins=bins_rolling_sold_mean)['partial_dependence']
    # ICE curves for selling_trend across percentiles at bins_selling_trend intervals
    par_dep_selling_trend[col_name] = par_dep('selling_trend',
                                              validshap0[validshap0['id'] == int(percentile_dict[i])][X],
                                              xgb_model,
                                              bins=bins_selling_trend)['partial_dependence']
    # ICE curves for sell_price
    par_dep_sell_price[col_name] = par_dep('sell_price',
                                           validshap0[validshap0['id'] == int(percentile_dict[i])][X],
                                           xgb_model,
                                           bins=bins_sell_price)['partial_dependence']
    # ICE curves for revenue
    par_dep_revenue[col_name] = par_dep('revenue',
                                        validshap0[validshap0['id'] == int(percentile_dict[i])][X],
                                        xgb_model,
                                        bins=bins_revenue)['partial_dependence']
    # ICE curves for store_item_sold_avg
    par_dep_store_item_sold_avg[col_name] = par_dep('store_item_sold_avg',
                                                    validshap0[validshap0['id'] == int(percentile_dict[i])][X],
                                                    xgb_model,
                                                    bins=bins_store_item_sold_avg)['partial_dependence']
# display each augmented frame
par_dep_rolling_sold_mean
par_dep_selling_trend
par_dep_sell_price
par_dep_revenue
par_dep_store_item_sold_avg
def plot_par_dep_ICE(xs, par_dep_frame):
    """ Plots ICE overlayed onto partial dependence for a single variable.

    Args:
        xs: Name of variable for which to plot ICE and partial dependence.
        par_dep_frame: Name of Pandas DataFrame containing ICE and partial
                       dependence values.
    """
    # one shared axis carries every curve
    fig, axis = plt.subplots()
    # the percentile columns (everything except 'partial_dependence') are the
    # individual ICE curves
    ice_only = par_dep_frame.drop('partial_dependence', axis=1)
    ice_only.plot(x=xs, colormap='gnuplot', ax=axis)
    # draw the mean curve (partial dependence) on top as a bold red line
    par_dep_frame.plot(
        title='Partial Dependence and ICE for ' + str(xs),
        x=xs,
        y='partial_dependence',
        style='r-',
        linewidth=3,
        ax=axis)
    # park the legend outside the plotting area
    _ = plt.legend(bbox_to_anchor=(1.05, 0),
                   loc=3,
                   borderaxespad=0.)
# Render the combined partial-dependence + ICE plot for each analysed feature
plot_par_dep_ICE('rolling_sold_mean', par_dep_rolling_sold_mean) # PD and ICE for rolling sold mean
plot_par_dep_ICE('selling_trend', par_dep_selling_trend) # PD and ICE for selling trend
plot_par_dep_ICE('revenue', par_dep_revenue) # PD and ICE for revenue
plot_par_dep_ICE('sell_price', par_dep_sell_price) # PD and ICE for sell price
plot_par_dep_ICE('store_item_sold_avg', par_dep_store_item_sold_avg) # PD and ICE for store item sold average